{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from pandas.tseries.offsets import *\n",
    "from tqdm import tqdm\n",
    "from functools import reduce\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\", category=DeprecationWarning)\n",
    "warnings.filterwarnings(\"ignore\", category=RuntimeWarning, module=\"numpy\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CRSP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "crsp = pd.read_parquet('../data/WRDS/crsp_m.parquet')\n",
    "crsp['prc'] = abs(crsp['prc'])\n",
    "crsp['ME'] = (crsp['prc']) * crsp['shrout']\n",
    "crsp.sort_values(by=['permno','YearMonth'], inplace=True)\n",
    "crsp['bh1m'] = crsp.groupby('permno')['retadj'].shift(-1)\n",
    "crsp['prc_l1'] = crsp.groupby('permno')['prc'].shift(1)\n",
    "crsp.duplicated(subset=['permno','YearMonth']).sum()\n",
    "crsp.rename(columns={'permno':'PERMNO'}, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "crsp.duplicated(subset=['PERMNO','YearMonth']).sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# IBES"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### IBES summary actual ###\n",
    "## CRSP-IBES link table\n",
    "iclink = pd.read_csv('../data/WRDS/iclink_WRDS.csv')\n",
    "iclink['sdate'] = pd.to_datetime(iclink['sdate'])\n",
    "iclink['edate'] = pd.to_datetime(iclink['edate'])\n",
    "iclink.dropna(subset=['PERMNO'], inplace=True)\n",
    "\n",
    "# PERMNO -- > GVKEY\n",
    "ccm = pd.read_parquet('../data/WRDS/ccm.parquet')\n",
    "# if linkenddt is missing then set to today date\n",
    "ccm['linkenddt']=ccm['linkenddt'].fillna(pd.to_datetime('today'))\n",
    "ccm.rename(columns={'permno':'PERMNO'}, inplace=True)\n",
    "ccm['gvkey'] = ccm['gvkey'].astype(float)\n",
    "\n",
    "compustat_q = pd.read_csv('../data/WRDS/comp_fundq.csv',\n",
    "                          usecols=['gvkey','datadate','rdq',\n",
    "                                   ],\n",
    "                        #   nrows=20\n",
    "                         )\n",
    "compustat_q['PENDS'] = pd.to_datetime(compustat_q['datadate'],format='%Y%m%d')\n",
    "compustat_q['ANNDATS'] = pd.to_datetime(compustat_q['rdq'], format='%Y%m%d')\n",
    "\n",
    "# actual earnings from IBES summary unadjusted\n",
    "AE = pd.read_parquet('../data/WRDS/Actual_EPS_summary_unadjusted_2023.parquet')\n",
    "AE = AE[(AE['CUSIP']!='') & (AE['CUSIP']!='00000000')]\n",
    "AE['YearMonth'] = pd.to_datetime(AE['STATPERS']) + MonthEnd(0)\n",
    "\n",
    "# TICKER --> PERMNO\n",
    "AE = AE.merge(iclink[['TICKER','PERMNO','NCUSIP','sdate','edate']],\n",
    "         on=['TICKER'],\n",
    "        )\n",
    "AE = AE[(AE['STATPERS']>=AE['sdate']) \\\n",
    "        & (AE['STATPERS']<=AE['edate']) \\\n",
    "        & (AE['CUSIP']==AE['NCUSIP'])\n",
    "        ]\n",
    "\n",
    "# PERMNO --> GVKEY\n",
    "AE = AE.merge(ccm, on=['PERMNO'])\n",
    "AE = AE[(AE['YearMonth']>=AE['linkdt']) & (AE['YearMonth']<=AE['linkenddt'])]\n",
    "\n",
    "AE['FY0EDATS'] = pd.to_datetime(AE['FY0EDATS'], format='%Y-%m-%d')\n",
    "AE['INT0DATS'] = pd.to_datetime(AE['INT0DATS'], format='%Y-%m-%d')\n",
    "\n",
    "# GVKEY --> rdq (ANNDATS)\n",
    "AE_A = AE[['TICKER','PERMNO','gvkey','FY0EDATS','FY0A']]\\\n",
    "        .rename(columns={'FY0EDATS':'PENDS','FY0A':'EPS_true'})\\\n",
    "        .drop_duplicates(subset=['TICKER','PENDS'])\n",
    "AE_A = AE_A.merge(compustat_q[['gvkey','PENDS','ANNDATS']],\n",
    "           on=['gvkey','PENDS'], how='left'\n",
    "           )\n",
    "AE_A['PDICITY'] = 'ANN'\n",
    "\n",
    "# quarterly actual AE\n",
    "AE_Q = AE[['TICKER','PERMNO','gvkey','INT0DATS','INT0A']]\\\n",
    "        .rename(columns={'INT0DATS':'PENDS','INT0A':'EPS_true'})\\\n",
    "        .drop_duplicates(subset=['TICKER','PENDS'])\n",
    "AE_Q = AE_Q.merge(compustat_q[['gvkey','PENDS','ANNDATS']],\n",
    "              on=['gvkey','PENDS'], how='left'\n",
    "              )\n",
    "AE_Q['PDICITY'] = 'QTR'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "AE_summary = pd.concat([AE_A[['TICKER','PENDS','EPS_true','ANNDATS','PDICITY']],\n",
    "           AE_Q[['TICKER','PENDS','EPS_true','ANNDATS','PDICITY']]\n",
    "           ], axis=0)\n",
    "\n",
    "AE_summary.to_parquet('../data/Results/AE_summary.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "AE_summary = pd.read_parquet('../data/Results/AE_summary.parquet')\n",
    "## Full IBES actual\n",
    "EPS_true = pd.read_parquet('../data/WRDS/Actual_EPS_detail_unadjusted_2023.parquet')\n",
    "EPS_true['EPS_true'] = EPS_true['VALUE']\n",
    "EPS_true['PENDS'] = pd.to_datetime(EPS_true['PENDS'], format='%Y-%m-%d')\n",
    "EPS_true['ANNDATS'] = pd.to_datetime(EPS_true['ANNDATS'], format='%Y-%m-%d')\n",
    "EPS_true = EPS_true[['TICKER', 'PENDS', 'EPS_true', 'ANNDATS', 'PDICITY']].copy()\n",
    "\n",
    "# V1: combine detail and summary\n",
    "EPS_true = pd.concat([EPS_true, AE_summary], axis=0)\n",
    "EPS_true.dropna(subset=['EPS_true','ANNDATS'], inplace=True)\n",
    "EPS_true.drop_duplicates(subset=['TICKER','PENDS','PDICITY'], inplace=True)\n",
    "\n",
    "# V2: fill ANADATS by IBES detail actual\n",
    "# EPS_true = AE_summary.merge(EPS_true[['TICKER','PENDS','PDICITY',\n",
    "#                                       'ANNDATS','EPS_true'\n",
    "#                                       ]].rename(columns={'ANNDATS':'ANNDATS_detail',\n",
    "#                                                          'EPS_true':'EPS_true_detail'\n",
    "#                                                         }),\n",
    "#                             on=['TICKER','PENDS','PDICITY'],\n",
    "#                             how='left'\n",
    "#                             )\n",
    "# EPS_true['ANNDATS'] = EPS_true['ANNDATS'].fillna(EPS_true['ANNDATS_detail'])\n",
    "# EPS_true['EPS_true'] = EPS_true['EPS_true'].fillna(EPS_true['EPS_true_detail'])\n",
    "# EPS_true.drop(columns=['ANNDATS_detail','EPS_true_detail'], inplace=True)\n",
    "# EPS_true.dropna(subset=['EPS_true','ANNDATS'], inplace=True)\n",
    "# EPS_true.drop_duplicates(subset=['TICKER','PENDS','PDICITY'], inplace=True)\n",
    "# print(AE_summary.shape, EPS_true.shape)\n",
    "\n",
    "# V3: only use IBES detail actual\n",
    "# EPS_true = AE_summary.copy()\n",
    "\n",
    "# V4: only use IBES summary actual\n",
    "# EPS_true = EPS_true.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Last EPS\n",
    "EPS_true_qtr = EPS_true[EPS_true['PDICITY'] == 'QTR'].sort_values(by=['TICKER','PENDS'])\n",
    "EPS_true_ann = EPS_true[EPS_true['PDICITY'] == 'ANN'].sort_values(by=['TICKER','PENDS'])\n",
    "\n",
    "EPS_true_qtr['EPS_true_l1'] = EPS_true_qtr.groupby('TICKER')['EPS_true'].shift(1)\n",
    "EPS_true_qtr['ANNDATS_l1'] = EPS_true_qtr.groupby('TICKER')['ANNDATS'].shift(1)\n",
    "\n",
    "EPS_true_ann['EPS_true_l1'] = EPS_true_ann.groupby('TICKER')['EPS_true'].shift(1)\n",
    "EPS_true_ann['ANNDATS_l1'] = EPS_true_ann.groupby('TICKER')['ANNDATS'].shift(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## IBES consensus\n",
    "consensus = pd.read_parquet('../data/WRDS/Forecast_EPS_summary_unadjusted_2023.parquet')\n",
    "consensus['STATPERS'] = pd.to_datetime(consensus['STATPERS'], format='%Y-%m-%d')\n",
    "consensus['FPEDATS'] = pd.to_datetime(consensus['FPEDATS'], format='%Y-%m-%d')\n",
    "consensus['YearMonth'] = consensus['STATPERS'] + MonthEnd(0)\n",
    "consensus['EPS_ana'] = consensus['MEANEST']\n",
    "\n",
    "# Merge with Actual\n",
    "consensus_quarter = consensus[consensus.FPI.isin(['6','7','8'])].copy()\n",
    "consensus_annual = consensus[consensus.FPI.isin(['1','2'])].copy()\n",
    "\n",
    "consensus_quarter = consensus_quarter.merge(EPS_true_qtr[['TICKER','PENDS','EPS_true','ANNDATS','ANNDATS_l1','EPS_true_l1']],\n",
    "                                            left_on=['TICKER','FPEDATS'],\n",
    "                                            right_on=['TICKER','PENDS'],\n",
    "                                            how='left'\n",
    "                                           )\n",
    "\n",
    "consensus_annual = consensus_annual.merge(EPS_true_ann[['TICKER','PENDS','EPS_true','ANNDATS','ANNDATS_l1','EPS_true_l1']],\n",
    "                                            left_on=['TICKER','FPEDATS'],\n",
    "                                            right_on=['TICKER','PENDS'],\n",
    "                                            how='left'\n",
    "                                         )\n",
    "\n",
    "consensus = pd.concat([consensus_quarter, consensus_annual], axis=0)\n",
    "\n",
    "# ICLINK\n",
    "consensus = consensus.merge(iclink[['TICKER','PERMNO','sdate','edate','NCUSIP']],\n",
    "                           on=['TICKER'])\n",
    "consensus = consensus[(consensus['STATPERS']>=consensus['sdate']) & (consensus['STATPERS']<=consensus['edate'])]\n",
    "consensus = consensus[consensus['CUSIP'] == consensus['NCUSIP']]\n",
    "consensus = consensus.drop(columns=['sdate','edate','NCUSIP'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Adjusted Actual\n",
    "def process_consensus(FPI, consensus, crsp):\n",
    "    df = consensus[consensus['FPI']==FPI][['TICKER','PERMNO','STATPERS','EPS_ana',\n",
    "                                        'EPS_true','EPS_true_l1','ANNDATS','ANNDATS_l1',\n",
    "                                        ]].drop_duplicates(subset=['TICKER','STATPERS']).copy()\n",
    "\n",
    "    # ANN month cfacshr\n",
    "    df['ANN_m'] = df['ANNDATS'] + MonthEnd(0)\n",
    "    df = df.merge(crsp[['PERMNO','YearMonth','cfacshr']],\n",
    "                left_on=['PERMNO','ANN_m'],\n",
    "                right_on=['PERMNO','YearMonth'],\n",
    "                )\n",
    "    df['EPS_true'] = df['EPS_true']/df['cfacshr']\n",
    "    df.drop(columns=['YearMonth','cfacshr'], inplace=True)\n",
    "\n",
    "    # Last ANN month cfacshr\n",
    "    df['ANN_m'] = df['ANNDATS_l1'] + MonthEnd(0)\n",
    "    df = df.merge(crsp[['PERMNO','YearMonth','cfacshr']],\n",
    "                left_on=['PERMNO','ANN_m'],\n",
    "                right_on=['PERMNO','YearMonth'],\n",
    "                how='left'\n",
    "                )\n",
    "    df['EPS_true_l1'] = df['EPS_true_l1']/df['cfacshr']\n",
    "    df.drop(columns=['YearMonth','cfacshr','ANN_m'], inplace=True)\n",
    "\n",
    "    df['YearMonth'] = df['STATPERS'] + MonthEnd(0)\n",
    "\n",
    "    df = df.merge(crsp[['PERMNO','YearMonth','cfacshr']], on=['PERMNO','YearMonth'])\n",
    "\n",
    "    # Adjust to today cfacshr\n",
    "    df['EPS_true'] = df['EPS_true'] * df['cfacshr']\n",
    "    df['EPS_true_l1'] = df['EPS_true_l1'] * df['cfacshr']\n",
    "    df.drop(columns=['cfacshr'], inplace=True)\n",
    "\n",
    "    return df\n",
    "\n",
    "q1 = process_consensus('6', consensus, crsp)\n",
    "q2 = process_consensus('7', consensus, crsp)\n",
    "q3 = process_consensus('8', consensus, crsp)\n",
    "y1 = process_consensus('1', consensus, crsp)\n",
    "y2 = process_consensus('2', consensus, crsp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rename columns and Merge all horizon\n",
    "q1.rename(columns={'EPS_true_l1':'EPS_true_l1_q1',\n",
    "                   'EPS_true':'EPS_true_q1',\n",
    "                   'EPS_ana':'EPS_ana_q1',\n",
    "                   'ANNDATS':'ANNDATS_q1','ANNDATS_l1':'ANNDATS_l1_q1',\n",
    "                  },\n",
    "          inplace=True)\n",
    "q2.rename(columns={'EPS_true_l1':'EPS_true_l1_q2',\n",
    "                   'EPS_true':'EPS_true_q2',\n",
    "                   'EPS_ana':'EPS_ana_q2',\n",
    "                   'ANNDATS':'ANNDATS_q2','ANNDATS_l1':'ANNDATS_l1_q2',\n",
    "                  },\n",
    "            inplace=True)\n",
    "q3.rename(columns={'EPS_true_l1':'EPS_true_l1_q3',\n",
    "                   'EPS_true':'EPS_true_q3',\n",
    "                   'EPS_ana':'EPS_ana_q3',\n",
    "                   'ANNDATS':'ANNDATS_q3','ANNDATS_l1':'ANNDATS_l1_q3',\n",
    "                  },\n",
    "            inplace=True)\n",
    "y1.rename(columns={'EPS_true_l1':'EPS_true_l1_y1',\n",
    "                   'EPS_true':'EPS_true_y1',\n",
    "                   'EPS_ana':'EPS_ana_y1',\n",
    "                   'ANNDATS':'ANNDATS_y1','ANNDATS_l1':'ANNDATS_l1_y1',\n",
    "                  },\n",
    "            inplace=True)\n",
    "y2.rename(columns={'EPS_true_l1':'EPS_true_l1_y2',\n",
    "                   'EPS_true':'EPS_true_y2',\n",
    "                   'EPS_ana':'EPS_ana_y2',\n",
    "                   'ANNDATS':'ANNDATS_y2','ANNDATS_l1':'ANNDATS_l1_y2',\n",
    "                  },\n",
    "            inplace=True)\n",
    "\n",
    "ana_all = reduce(lambda x,y: pd.merge(x,y,\n",
    "                                       on=['PERMNO','YearMonth'],\n",
    "                                       how='outer'),\n",
    "                 [q1.drop(columns=['TICKER']),\n",
    "                  q2.drop(columns=['TICKER','STATPERS']),\n",
    "                  q3.drop(columns=['TICKER','STATPERS']),\n",
    "                  y1.drop(columns=['TICKER','STATPERS']),\n",
    "                  y2.drop(columns=['TICKER','STATPERS',])])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = ana_all.merge(crsp[['PERMNO','YearMonth','siccd',\n",
    "                         'ret', 'prc', 'bh1m', 'shrout', 'ME','prc_l1',\n",
    "                        #  'shrcd', 'exchcd'\n",
    "                        ]],\n",
    "                   on=['PERMNO','YearMonth'],\n",
    "                   )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.duplicated(subset=['PERMNO','YearMonth']).sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Financial Ratios"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#### Financial Ratios ####\n",
    "ratios = pd.read_stata('../data/WRDS/financial_ratio.dta')\n",
    "ratios['public_date'] = ratios['public_date'] + MonthEnd(0)\n",
    "ratios['gvkey'] = ratios['gvkey'].astype(float)\n",
    "\n",
    "#### COMUPSTAT ####\n",
    "compa = pd.read_parquet('../data/WRDS/compa.parquet')\n",
    "compa['gvkey'] = compa['gvkey'].astype(float)\n",
    "\n",
    "## SIC code from Compustat\n",
    "ratios = ratios.merge(compa[['gvkey','datadate','sich']],\n",
    "                     left_on = ['gvkey','adate'],\n",
    "                     right_on = ['gvkey','datadate'],\n",
    "                     how = 'left'\n",
    "                    )\n",
    "ratios.drop_duplicates(subset=['permno','public_date'], inplace=True)\n",
    "\n",
    "df = df.merge(ratios.rename(columns={'permno':'PERMNO','public_date':'YearMonth'}),\n",
    "              on=['PERMNO','YearMonth'],\n",
    "              how = 'left'\n",
    "             )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.duplicated(subset=['PERMNO','YearMonth']).sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## FF49 Industry\n",
    "def zip_2_list(x):\n",
    "    tmp = []\n",
    "    for i, j in x:\n",
    "        tmp += list(range(i,j))\n",
    "    return tmp\n",
    "\n",
    "def fama_industry(sic, fama10):\n",
    "    for i in fama10.index:\n",
    "        if sic in fama10[i]:\n",
    "            return i\n",
    "    ## others\n",
    "    return 49\n",
    "\n",
    "# If sich is missing, use siccd from CRSP\n",
    "df['sic'] = np.where(df['sich'].isna(), df['siccd'], df['sich'])\n",
    "df['sic'] = df['sic'].astype(int)\n",
    "\n",
    "fama49 = pd.read_csv('../data/Other/Siccodes49.csv')\n",
    "fama49 = fama49.groupby('ff49').apply(lambda x: zip_2_list(zip(x.sic1, x.sic2+1)))\n",
    "\n",
    "_sic = df['sic'].unique()\n",
    "_sicff = pd.DataFrame(_sic).rename(columns={0:'sic'})\n",
    "_sicff['fama49'] = _sicff['sic'].apply(lambda x: fama_industry(x,fama49))\n",
    "\n",
    "df = pd.merge(df, _sicff, how='left', on=['sic'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.duplicated(subset=['PERMNO','YearMonth']).sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Fill NA with Industry Median\n",
    "## preprocess\n",
    "ratio_chars = ['CAPEI', 'bm',\n",
    "       'evm', 'pe_exi', 'pe_inc', 'ps', 'pcf',\n",
    "       'dpr', 'npm', 'opmbd', 'opmad', 'gpm', 'ptpm', 'cfm', 'roa', 'roe',\n",
    "       'roce', 'efftax', 'aftret_eq', 'aftret_invcapx', 'aftret_equity',\n",
    "       'pretret_noa', 'pretret_earnat', 'GProf', 'equity_invcap',\n",
    "       'debt_invcap', 'totdebt_invcap', 'capital_ratio', 'int_debt',\n",
    "       'int_totdebt', 'cash_lt', 'invt_act', 'rect_act', 'debt_at',\n",
    "       'debt_ebitda', 'short_debt', 'curr_debt', 'lt_debt', 'profit_lct',\n",
    "       'ocf_lct', 'cash_debt', 'fcf_ocf', 'lt_ppent', 'dltt_be', 'debt_assets',\n",
    "       'debt_capital', 'de_ratio', 'intcov', 'intcov_ratio', 'cash_ratio',\n",
    "       'quick_ratio', 'curr_ratio', 'cash_conversion', 'inv_turn', 'at_turn',\n",
    "       'rect_turn', 'pay_turn', 'sale_invcap', 'sale_equity', 'sale_nwc',\n",
    "       'rd_sale', 'adv_sale', 'staff_sale', 'accrual', 'ptb', 'PEG_trailing',\n",
    "       'divyield']\n",
    "\n",
    "## XX per share characteristics: IN Online Appendix A.2, BHL states that they \"consider another twenty-six\n",
    "# fundamental values per share derived from these financial ratios\"\n",
    "# We recover these features from their persudo-data shared in RFS code & data\n",
    "# See the data they shared: \"/Earnings Forecasts/SampleFigure1.csv\". Columns 'BU' to 'CR', totaling 24\n",
    "# I add \"sales_p\" & \"invcap_p\" to make it 26\n",
    "per_share_chars = ['dividend_p','BE_p','Liability_p','cur_liability_p','LT_debt_p',\n",
    "                  'cash_p', 'total_asset_p', 'tot_debt_p', 'accrual_p', 'EBIT_p',\n",
    "                   'cur_asset_p', 'pbda_p', 'ocf_p', 'inventory_p', 'receivables_p',\n",
    "                   'Cur_debt_p', 'interest_p', 'fcf_ocf_p', 'evm_p',\n",
    "                   'sales_p', 'invcap_p', 'c_equity_p', 'rd_p', 'opmad_p', 'gpm_p','ptpm_p'\n",
    "                  ]\n",
    "\n",
    "df['dividend_p'] = df['divyield'] * df['prc']\n",
    "df['BE_p'] = df['bm'] * df['prc'] # book-equity\n",
    "df['Liability_p'] = df['de_ratio'] * df['BE_p'] # Total Debt\n",
    "df['cur_liability_p'] = df['curr_debt'] * df['Liability_p']\n",
    "df['LT_debt_p'] = df['lt_debt'] * df['Liability_p']\n",
    "df['cash_p'] = df['cash_lt'] * df['Liability_p']\n",
    "df['total_asset_p'] = df['Liability_p'] / df['debt_at']\n",
    "df['tot_debt_p'] = df['debt_assets'] * df['total_asset_p']\n",
    "df['accrual_p'] = df['accrual'] * df['total_asset_p']\n",
    "df['EBIT_p'] = df['debt_ebitda'] / df['tot_debt_p']\n",
    "df['cur_asset_p'] = df['curr_ratio']*df['cur_liability_p']\n",
    "df['pbda_p'] = df['profit_lct'] * df['cur_liability_p'] # Operating Income before D&A\n",
    "df['ocf_p'] = df['ocf_lct'] * df['cur_liability_p'] # Operating Cash Flow\n",
    "df['inventory_p'] = df['invt_act'] * df['cur_asset_p']\n",
    "df['receivables_p'] = df['rect_act'] * df['cur_asset_p']\n",
    "df['Cur_debt_p'] = df['short_debt'] * df['total_asset_p'] # Short-term Debt\n",
    "df['interest_p'] = df['int_totdebt'] * df['tot_debt_p']\n",
    "df['fcf_ocf_p'] = df['fcf_ocf'] * df['ocf_p'] # Free Cash Flow\n",
    "df['evm_p'] = df['evm'] * df['EBIT_p'] # Multiple of Enterprise Value\n",
    "\n",
    "## ADD by YANDI ##\n",
    "df['sales_p'] = df['sale_equity'] * df['BE_p'] # Sales\n",
    "df['invcap_p'] = df['debt_invcap'] / df['LT_debt_p'] # Invested Capital\n",
    "\n",
    "## Recover theirs\n",
    "df['c_equity_p'] = df['equity_invcap'] * df['invcap_p'] # Common Equity\n",
    "df['rd_p'] = df['rd_sale'] * df['sales_p'] # R&D\n",
    "df['opmad_p'] = df['opmad'] * df['sales_p'] # Operating Income After Depreciation\n",
    "df['gpm_p'] = df['gpm']  * df['sales_p'] # Gross Profit\n",
    "df['ptpm_p'] = df['ptpm']  * df['sales_p'] # Pretax Income\n",
    "\n",
    "df.replace([-np.inf, np.inf], np.nan, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Fill NA with Industry Median\n",
    "for v in tqdm(ratio_chars+per_share_chars):\n",
    "    df[v] = df.groupby(['YearMonth','fama49'], group_keys=False)[v]\\\n",
    "              .apply(lambda x: x.fillna(x.median()) if not x.isna().all() else x)\n",
    "\n",
    "## In case some characteristics are all NA in some industry\n",
    "for v in tqdm(ratio_chars+per_share_chars):\n",
    "    df[v] = df.groupby(['YearMonth'], group_keys=False)[v].apply(lambda x: x.fillna(x.median()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Macro Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Macro Data\n",
    "RGDP = pd.read_excel('../data/Macro/RGDP.xlsx').set_index('DATE')\n",
    "RGDP = RGDP.apply(lambda x: np.log(x.dropna()).diff().iloc[-1], axis=0)\n",
    "RGDP.index = pd.date_range(start='1965-11', end='2024-04', freq='ME')\n",
    "\n",
    "RCON = pd.read_excel('../data/Macro/RCON.xlsx').set_index('DATE')\n",
    "RCON = RCON.apply(lambda x: np.log(x.dropna()).diff().iloc[-1], axis=0)\n",
    "RCON.index = pd.date_range(start='1965-11', end='2024-04', freq='ME')\n",
    "\n",
    "INDPROD = pd.read_excel('../data/Macro/INDPROD.xlsx').set_index('DATE')\n",
    "INDPROD = INDPROD.apply(lambda x: np.log(x.dropna()).diff().iloc[-1], axis=0)\n",
    "INDPROD.index = pd.date_range(start='1962-11', end='2024-03', freq='ME')\n",
    "\n",
    "UNEMP = pd.read_excel('../data/Macro/UNEMP.xlsx').set_index('DATE')\n",
    "UNEMP = UNEMP['RUC24Q1'].dropna()\n",
    "UNEMP.index = pd.date_range(start='1948-01', end='2024-02', freq='ME')\n",
    "## LAG one month, we can only observe last month UNEMP\n",
    "UNEMP = UNEMP.shift(1)\n",
    "\n",
    "macro = pd.DataFrame({'RGDP':RGDP,'RCON':RCON,'INDPROD':INDPROD,'UNEMP':UNEMP})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.merge(macro, left_on='YearMonth', right_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.duplicated(subset=['PERMNO','YearMonth']).sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Final Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ratio_chars = ['CAPEI', 'bm',\n",
    "       'evm', 'pe_exi', 'pe_inc', 'ps', 'pcf',\n",
    "       'dpr', 'npm', 'opmbd', 'opmad', 'gpm', 'ptpm', 'cfm', 'roa', 'roe',\n",
    "       'roce', 'efftax', 'aftret_eq', 'aftret_invcapx', 'aftret_equity',\n",
    "       'pretret_noa', 'pretret_earnat', 'GProf', 'equity_invcap',\n",
    "       'debt_invcap', 'totdebt_invcap', 'capital_ratio', 'int_debt',\n",
    "       'int_totdebt', 'cash_lt', 'invt_act', 'rect_act', 'debt_at',\n",
    "       'debt_ebitda', 'short_debt', 'curr_debt', 'lt_debt', 'profit_lct',\n",
    "       'ocf_lct', 'cash_debt', 'fcf_ocf', 'lt_ppent', 'dltt_be', 'debt_assets',\n",
    "       'debt_capital', 'de_ratio', 'intcov', 'intcov_ratio', 'cash_ratio',\n",
    "       'quick_ratio', 'curr_ratio', 'cash_conversion', 'inv_turn', 'at_turn',\n",
    "       'rect_turn', 'pay_turn', 'sale_invcap', 'sale_equity', 'sale_nwc',\n",
    "       'rd_sale', 'adv_sale', 'staff_sale', 'accrual', 'ptb', 'PEG_trailing',\n",
    "       'divyield']\n",
    "\n",
    "per_share_chars = ['dividend_p','BE_p','Liability_p','cur_liability_p','LT_debt_p',\n",
    "                  'cash_p', 'total_asset_p', 'tot_debt_p', 'accrual_p', 'EBIT_p',\n",
    "                   'cur_asset_p', 'pbda_p', 'ocf_p', 'inventory_p', 'receivables_p',\n",
    "                   'Cur_debt_p', 'interest_p', 'fcf_ocf_p', 'evm_p',\n",
    "                   'sales_p', 'invcap_p', 'c_equity_p', 'rd_p', 'opmad_p', 'gpm_p','ptpm_p'\n",
    "                  ]\n",
    "\n",
    "macro_chars = ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "\n",
    "fundamental_chars = ['ret', 'prc',\n",
    "                    'EPS_true_l1_q1','EPS_true_l1_q2','EPS_true_l1_q3',\n",
    "                    'EPS_true_l1_y1','EPS_true_l1_y2',\n",
    "                    ]\n",
    "\n",
    "analyst_chars = ['EPS_ana_q1','EPS_ana_q2','EPS_ana_q3','EPS_ana_y1','EPS_ana_y2']\n",
    "\n",
    "targets = ['EPS_true_q1', 'EPS_true_q2', 'EPS_true_q3', 'EPS_true_y1', 'EPS_true_y2']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Lag one month information ###\n",
    "### Except for analyst forecasts\n",
    "df.sort_values(by=['PERMNO', 'YearMonth'], inplace=True)\n",
    "vars_lag = ratio_chars + per_share_chars + macro_chars + fundamental_chars\n",
    "df[vars_lag] = df.groupby('PERMNO')[vars_lag].shift(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ## FillNA with Industry Median\n",
    "fillNA = ratio_chars + per_share_chars + fundamental_chars\n",
    "for v in tqdm(fillNA):\n",
    "    df[v] = df.groupby(['YearMonth','fama49'], group_keys=False)[v].apply(lambda x: x.fillna(x.median()))\n",
    "## In case some characteristics are all NA in some industry\n",
    "for v in tqdm(fillNA + macro_chars):\n",
    "    df[v] = df.groupby(['YearMonth'], group_keys=False)[v].apply(lambda x: x.fillna(x.median()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_tmp = df[(df['YearMonth'] >= '1984-01-01') & (df['YearMonth'] <= '2019-12-31')].reset_index(drop=True).copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# winsorization period-by-period\n",
    "cols = ratio_chars + per_share_chars + fundamental_chars + analyst_chars + targets\n",
    "df_tmp[cols] = df_tmp.groupby('YearMonth',group_keys=False)[cols]\\\n",
    "                             .transform(lambda x: x.clip(x.quantile(0.01),x.quantile(0.99)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_tmp.rename(columns={'PERMNO':'permno'}, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_tmp.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Our working data\n",
    "df_tmp.to_parquet('../data/Results/df_train_new.parquet')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "man_versus_machine",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
